!pip install -r requirements.txt
import importlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from IPython.display import display, HTML, clear_output
from cartoframes.contrib import vector
from cartoframes import Credentials, CartoContext, QueryLayer
import cf
from cf.do import qk2mercatorpolygon
import pandas as pd
import seaborn as sns
import numpy as np
import missingno as msno
import re
from analysis.twin import draw_boxplot_df, draw_hist, draw_corrm
import ipywidgets as widgets
def view_variable(changed):
    """Redraw the map and histogram for the variable picked in the dropdown.

    Callback for the ipywidgets ``dropdown`` (also invoked once manually to
    render the initial view). Only the ``'new'`` entry of the change dict is
    used: it holds the id of the selected variable. Relies on the
    module-level ``dropdown`` widget, CARTO context ``cc`` and the augmented
    GeoDataFrame ``nyc_wework_locations_ta_augmented_gdf``.
    """
    clear_output()
    display(dropdown)
    selected = changed['new']
    # CARTO VL styling: color cells by the selected variable with the Temps ramp.
    style = f'ramp(prop("{selected}"), Temps)'
    carto_map = cf.viz.map(
        nyc_wework_locations_ta_augmented_gdf,
        cc,
        color=style,
        legend=selected,
        interactivity=selected,
    )
    display(carto_map)
    # Companion histogram of the variable's distribution across isochrones.
    nyc_wework_locations_ta_augmented_gdf[selected].plot.hist()
def entities2df(entities):
    """Convert an iterable of catalog entities into a pandas DataFrame.

    Each entity must expose a ``to_record()`` method returning a dict;
    one DataFrame row is produced per entity.
    """
    records = [entity.to_record() for entity in entities]
    return pd.DataFrame.from_records(records)
WeWork is a company that provides shared office spaces. Currently WeWork offers 58 offices in NYC and only 21 offices in LA. Given a WeWork office location in NYC we want to combine the different data sources stored in CARTO's Data Observatory (DO) to find locations in LA that show similar characteristics. The results of this similarity analysis will indicate the best spots where WeWork should plan opening a new office / relocating an existing office with similar characteristics with respect to the target location in NYC.
The notebook is organized as follows:
cf.do.grid_cells(polygon, variables, zoom) --> from a target polygon retrieve the grid cells enriched with data from the DO (at a given zoom level)
cf.do.augment(polygons,variables, zoom) --> enrich each polygon with the data from the DO (at a given zoom level)
cf.load.grid_data(input_file, fields, zooms) --> interpolate custom data to the common DO grid (at a given zoom level). Here 'fields' needs both the variable name and the interpolation function required, e.g. fields={'txs_count': 'sum'}.
We set up the CARTOframes credentials and context.
# Set up the CARTOframes credentials and context (fill in username/key).
creds = Credentials(username='', key='')
cc = CartoContext(creds=creds)

# Regions available in the Data Observatory catalog matching 'US'.
regions = cf.catalog.list_regions(search_term='US')
regions_df = entities2df(regions)
# Quick-look map of the region boundaries.
cf.viz.map(regions_df, cc,color='opacity(red, .5)')
# Spatial resolutions available in each region of interest.
# NOTE(review): regions[1]/regions[2] assume a fixed ordering of the catalog
# search results (NY and LA respectively) — verify against the regions table.
spatial_resolutions_ny = entities2df(regions[1].list_spatial_resolutions())
spatial_resolutions_ny['region'] = 'NY'
spatial_resolutions_la = entities2df(regions[2].list_spatial_resolutions())
spatial_resolutions_la['region'] = 'LA'
print('Available spatial resolutions:')
# Outer-join on description so resolutions available in only one region still
# appear; then collapse the two per-region columns into a single list column.
spatial_resolutions = pd.merge(
    spatial_resolutions_ny[['description','region']],
    spatial_resolutions_la[['description','region']],
    on='description',
    how='outer')
spatial_resolutions['region'] = spatial_resolutions[['region_x', 'region_y']].values.tolist()
spatial_resolutions.drop(['region_x', 'region_y'], axis=1, inplace=True)
spatial_resolutions
# Focus on the NY region at the ~305 m grid resolution.
ny = regions[1]
spatial_resolutions = ['305 meters']
# Data providers offering datasets at that resolution.
providers = ny.list_providers(spatial_resolutions=spatial_resolutions)
providers_df = entities2df(providers)
providers_df[['description','id','name']].sort_values(by = 'id')
Example 1: POIs from Pitney Bowes tagged as food.
# Example 1: POI attributes tagged 'Food' available for NY at the chosen resolution.
selected_variables = ['Food']
selected_poi_attributes = cf.catalog.list_attributes(regions=['NY'], spatial_resolutions=spatial_resolutions, tags=selected_variables)
selected_poi_attributes = pd.DataFrame.from_records([o.__dict__ for o in selected_poi_attributes])
# Record which resolution the attributes were listed for.
selected_poi_attributes['spatial_resolutions'] = spatial_resolutions[0]
selected_poi_attributes[['description','source_description','data_since','data_until','temporal_resolutions','agg']].head()
Example 2: Population and income data from the DO.
# Example 2: demographics attributes (population, income) for NY at the chosen resolution.
selected_variables = ['Population', 'Income']
selected_demographics_attributes = cf.catalog.list_attributes(regions=['NY'], spatial_resolutions=spatial_resolutions, tags=selected_variables)
selected_demographics_attributes = pd.DataFrame.from_records([o.__dict__ for o in selected_demographics_attributes])
# Record which resolution the attributes were listed for.
selected_demographics_attributes['spatial_resolutions'] = spatial_resolutions[0]
selected_demographics_attributes[['description','source_description','data_since','data_until','temporal_resolutions','agg']].head()
# Load the existing WeWork office locations in NYC from the CARTO table,
# decoding the geometry column into shapely objects.
query = "SELECT * FROM wework_buildings WHERE primary_market = 'New York City'"
wework_nyc_locations_df = cc.query(query, decode_geom=True)
wework_nyc_locations_df[['name', 'address', 'is_open','min_office_price']].head(10)
# Quadkey zoom level used for the enrichment grid (~305 m cells).
zoom = 17
# Keep every catalog variable except the Spain-specific 'es_*' categories.
variables = cf.catalog.list_variables(categories='all')
selected_variables = []
for v in variables:
    if v.category != 'es_demographics' \
    and v.category != 'es_mobility' \
    and v.category != 'es_financial' \
    and v.category != 'es_traffic' \
    and v.category != 'es_poi':
        selected_variables.append(v)
# 10-minute (600 s) walking isochrones around each NYC WeWork office:
# these polygons define each office's trade area.
nyc_wework_locations_ta_gdf = cf.lds.isochrone(wework_nyc_locations_df,
                                               mode='walk', time=600)
# Enrich each trade-area polygon with the selected DO variables.
nyc_wework_locations_ta_augmented_gdf = cf.do.augment(nyc_wework_locations_ta_gdf,
                                                      variables=selected_variables, z=zoom)
initial_variable = selected_variables[0].id
# Dropdown widget to interactively explore each enriched variable on the map.
dropdown = widgets.Dropdown(
    options=[v.id for v in selected_variables],
    value=initial_variable,
    description='Variable'
)
dropdown.observe(view_variable, names='value')
# Render the initial view once, mimicking a widget change event.
view_variable({'new': initial_variable})
Based on the existing NYC WeWork locations, we want to find locations in LA that are characterized by "similar" values of the selected variables. The general approach is to compute the one-to-one distance in variable space
\begin{equation*} d\left(\mathbf{Y}_{origin}(i),\mathbf{Y}_{target}(f)\right) = \sqrt{\sum_j\left(Y_{origin}(i){_j} -Y_{target}(f){_j}\right)^2} \end{equation*}
between a selected origin cell ($i$) and each target cell ($f$), where the index $j$ identifies the variable type (e.g. total population).
First, we define the spatial resolution of the grid used for the analysis.
zoom = 17
These are user-defined variables, which will depend on the particular application. For example, for this use case, we do not want to include the road type-related POIs, otherwise the results may be influenced by the different road plans of LA and NY.
# Select the predictor variables for the similarity analysis.
variables = cf.catalog.list_variables(categories='all')
selected_variables = []
# NOTE(review): the notebook's original indentation was lost in this export;
# the nesting below is a reconstruction — verify against the source notebook.
for v in variables:
    # Skip the Spain-specific ('es_*') categories entirely.
    if v.category != 'es_demographics' \
    and v.category != 'es_mobility' \
    and v.category != 'es_financial' \
    and v.category != 'es_traffic' \
    and v.category != 'es_poi':
        # Mobility: keep only the all-devices/all-days aggregates.
        if v.category == 'mobility_emodo':
            if re.search('emd_all_all', v.name):
                selected_variables.append(v)
        # Demographics: keep everything.
        if v.category == 'demographics':
            selected_variables.append(v)
        # POIs: drop road-type ('highway') variables so the different road
        # plans of LA and NY do not bias the similarity results.
        elif v.category == 'poi':
            if not re.search('highway', v.name):
                selected_variables.append(v)
        # Financial: keep selected merchant-category prefixes and only the
        # spend-amount / transaction-count / account-count metrics.
        elif v.category == 'financial':
            if v.name[:3] in ('acc', 'ret','eap','gro','btn') \
            and re.search('(spend_amt|trans_cnt|acct_cnt)$', v.name):
                selected_variables.append(v)
pd.DataFrame(selected_variables, columns=['variables'])
# Origin geometries: the NYC WeWork office locations.
origin_locations = wework_nyc_locations_df['geometry']
# Target area: the Los Angeles boundary polygon from the DO catalog.
target_polygon = cf.catalog.named_entity('Los Angeles')
from shapely.geometry import box  # NOTE(review): `box` appears unused in this chunk
cf.viz.map(target_polygon, cc, color='opacity(red, .2)')
# Grid cells covering LA, enriched with the selected variables; the
# resulting index holds the quadkey of each cell.
target_polygon_variables = cf.do.grid_cells(target_polygon, selected_variables, zoom)
# Convert each quadkey into its Web-Mercator polygon for mapping.
target_polygon_variables['geometry'] = [qk2mercatorpolygon(qk) for qk in target_polygon_variables.index]
variable_id = selected_variables[0].id
print(variable_id)
# Visualize one variable over a subset of cells (filtered by quadkey prefix).
cf.viz.map(target_polygon_variables.filter(regex='^02301231113', axis=0), cc,
           strokeWidth=.5, color=f'opacity(ramp(prop("{variable_id}"), Temps), .5)',
           interactivity=variable_id)
To properly compute distances, we need to take into account a few issues:
what happens when the variables have different variances? It is clear that in this case distances in total population will be given a different weight than distances in the number of food POIs
what happens when there are correlations between variables? When there are correlated variables, there is a lot of redundant information in the computation of the distance. By considering the covariance between the points in the distance computation, we can remove that redundancy
how do we compute distances in the presence of missing data?
Note: the y-scale is logarithmic!
# Boxplots of each selected variable over the LA grid cells; log y-scale
# because the variables span very different orders of magnitude.
fig, axes = plt.subplots(figsize=(15, 7.5))
draw_boxplot_df(target_polygon_variables, axes, fig, [v.id for v in selected_variables])
plt.yscale("log")
This plot shows the correlation matrix for the selected variables. Matrix elements with larger values imply larger correlations.
draw_corrm(target_polygon_variables[[v.id for v in selected_variables]])
We can then look at the proportion of missing data for each selected variable.
msno.bar(target_polygon_variables[[v.id for v in selected_variables]])
If a POI variable is missing for a given location we can assume that the number of POIs for that variable in that location is zero. On the other hand, for other variables (e.g. demographics) we need to deal with missing values.
\begin{equation*} d\left(\mathbf{Y}_{origin}(i),\mathbf{Y}_{target}(f)\right) = \sqrt{\sum_j\left(Y_{origin}(i){_j} -Y_{target}(f){_j}\right)^2} \end{equation*}
To account for DIFFERENT VARIANCES, the data are standardized such that they have zero mean and unit variance. This also implies that all variables are given the same weight when computing distances
To account for CORRELATED VARIABLES the distances are not computed in the original variable space but in the $\textbf{Principal Components Space (PCA)}$ where the variables are linearly uncorrelated. PCA is also used to reduce the noise in the data, by retaining only a subset of PCs. To account for the uncertainty in the number of retained PCs, we created an ensemble of distance computations and, for each ensemble member, we randomly set the number of retained PC components within the range defined by the number of PCs explaining respectively 90% and 100% of the variance. For more details on how PCA works see the box below
To account for MISSING DATA, here we are using a probabilistic approach to PCA, called $\textbf{Probabilistic PCA (PPCA)}$. In PPCA the complete data are modelled by a generative latent variable model which iteratively updates the expected complete data log-likelihood and the maximum likelihood estimates of the parameters. PPCA has also the advantage of being faster than PCA because it does not require to compute the eigen-decomposition of the data covariance matrix. For more details see the box below and http://www.jmlr.org/papers/volume11/ilin10a/ilin10a.pdf
Principal component analysis (PCA) is a technique to transform data sets of high dimensional vectors into lower dimensional ones. This is useful, for instance, for feature extraction and noise reduction. PCA finds a smaller-dimensional linear representation of data vectors such that the original data could be reconstructed from the compressed representation with the minimum square error.
The most popular approach to PCA is based on the eigen-decomposition of the sample covariance matrix
\begin{equation*} \mathbf{C} = \dfrac{1}{n} \, \mathbf{Y} \, \mathbf{Y}^T \end{equation*}
where $\mathbf{Y}$ is the centered (row-wise zero empirical mean) $n \times d$ data matrix, where $d$ is the number of data points and $n$ the number of variables. After computing the eigenvector ($\mathbf{U}$) and eigenvalue ($\mathbf{D}$) matrices of the covariance matrix
\begin{equation*} \mathbf{C} = \mathbf{U} \, \mathbf{D} \, \mathbf{U}^T \end{equation*}
and rearranging their columns in order of decreasing eigenvalue, the principal components (PC or factors) are computed as
\begin{equation*} \mathbf{Z} = \mathbf{Y} \, \mathbf{P} \end{equation*}
where $P = \mathbf{U}_p $ represent the eigenvectors that correspond to the largest $p$ eigenvalues, i.e. to the largest amount of explained variance. The original (centered) data can then be reconstructed as
\begin{equation*} \mathbf{\hat{Y}} = \mathbf{Z} \, \mathbf{P}^T \end{equation*}
PCA can also be described as the maximum likelihood solution of a probabilistic latent variable model, which is known as PPCA:
\begin{equation*} Y_{ij} = \mathbf{P}_i \, \mathbf{Z}_j + m_i + \varepsilon_{ij} \quad i = 1, .., d; \, j = 1, .., n \end{equation*}
with
\begin{align} p(\mathbf{Z}_j) \sim N(0, \mathbb{1}) \\ p(\varepsilon_{ij}) \sim N(0, \nu) \end{align}
Both the principal components $Z$ and the noise $\varepsilon$ are assumed normally distributed. The model can be identified by finding the maximum likelihood (ML) estimate for the model parameters using the Expectation-Maximization (EM) algorithm by minimizing the mean-square error of the observed part of the data. EM is a general framework for learning parameters with incomplete data which iteratively updates the expected complete data log-likelihood and the maximum likelihood estimates of the parameters. In PPCA, the data are incomplete because the principal components, $Z_i$, are not observed and are treated as latent variables. When missing data are present, in the E-step, the expectation of the complete-data log-likelihood is taken with respect to the conditional distribution of the latent variables given the observed variables. In this case, the update EM rules are the following.
E-step: \begin{align} \mathbf{\Sigma}_{\mathbf{Z}_j} = \nu \left(\nu \, \mathbb{1} + \sum_i \mathbf{P}_i \mathbf{P}_i^T \right)^{-1} \\ \overline{\mathbf{Z}}_j = \dfrac{1}{\nu}\mathbf{\Sigma}_{\mathbf{Z}_j} \sum_i \mathbf{P}_i \left(Y_{ij}- m_i \right) \\ m_{i} = \dfrac{1}{n} \sum_j \left(Y_{ij} - \mathbf{P}_i^T \, \overline{\mathbf{Z}}_j \right) \\ \end{align}
M-step: \begin{align} \mathbf{P}_{i} = \left( \sum_j \overline{\mathbf{Z}}_j \overline{\mathbf{Z}}_j ^T + \mathbf{\Sigma}_{\mathbf{Z}_j} \right)^{-1} \sum_j \overline{\mathbf{Z}}_j \, \left(Y_{ij}- m_{ij} \right)\\ \nu = \dfrac{1}{n} \sum_{ij} \left[ \left(Y_{ij} - \mathbf{P}_i^T \, \overline{\mathbf{Z}}_j - m_i \right)^2 + \mathbf{P}_i^T \, \overline{\mathbf{Z}}_j \mathbf{P}_i \, \right] \end{align}
where each row of $\mathbf{P}$ and $\overline{\mathbf{Z}}$ is recomputed based only on those columns of $\overline{\mathbf{Z}}$ which contribute to the reconstruction of the observed values in the corresponding row of the data matrix.
# Run the similarity analysis: distances between each origin location and
# every LA grid cell, computed in PPCA component space (see notes above).
similarity_predictors = selected_variables
similarity = cf.analysis.similarity(
    origin_locations,
    target_polygon,
    similarity_predictors,
    z=zoom
)
So far we computed distances in variable space between a given WeWork location in NY and each location in the target area in LA. We can then select the target locations with the smallest distances as the best candidates where to open/relocate WeWork offices in LA. However, how do we know that a distance is small enough, i.e., how do we define similarity?
To answer this question, we defined a similarity skill score (SS) by comparing the score for each target location to the score from the mean vector data.
\begin{equation*} {SS}_{target}(f) = 1 - \dfrac{{S}_{target}(f)}{{S}_{target}(\widehat f)} \end{equation*}
where the score just represents the distance in variable space, i.e.
\begin{equation*} {S}_{target}(f) \equiv d\left(\mathbf{Y}_{origin}(i),\mathbf{Y}_{target}(f)\right) =: d \end{equation*}
If we account for the uncertainty in the computation of the distance for each target location via the ensemble generation, the score for each target location becomes
\begin{equation*} {S}_{target}(f) = \dfrac{1}{K} \sum_k d_k - \dfrac{1}{2K (K-1)} \sum_k \sum_l \left| d_k- d_l \right| \end{equation*}
where K is the number of ensemble members.
We can then order the target location in decreasing order of SS and retain only the results which satisfy a threshold condition (SS = 1 meaning perfect matching or zero distance). More information on scoring rule can be found here: https://rmets.onlinelibrary.wiley.com/doi/full/10.1002/qj.2270.
Having computed the one-to-one distances in variable space for all WeWork locations in NY, we can then visualize the results.
# Pick two example NYC offices and run the twin-area analysis for one of them.
manhattan_office = wework_nyc_locations_df.iloc[6]
print(manhattan_office['address'])

brooklyn_office = wework_nyc_locations_df.iloc[0]
print(brooklyn_office['address'])

# Use the Brooklyn office as the origin of the similarity search.
origin_location = brooklyn_office['geometry']
# BUG FIX: the original referenced an undefined name `target_location` here;
# the intent is to visualize the chosen origin location.
cf.viz.map(origin_location, cc)

# Rank LA grid cells by similarity to the origin location.
similar_cells_df = similarity.twin_area(origin_location)
similar_cells_df[similar_cells_df.id_type != 'origin'].head()

# Keep the 1000 most similar target cells (dropping the origin row itself).
# .copy() avoids pandas' SettingWithCopyWarning when adding the geometry
# column to a slice of similar_cells_df below.
top_locations = similar_cells_df[similar_cells_df.id_type != 'origin'].iloc[:1000].copy()
# The index holds quadkeys; convert each to its Web-Mercator polygon.
top_locations['geometry'] = [qk2mercatorpolygon(qk) for qk in top_locations.index]
# Persist the candidate locations back to CARTO for the final visualization.
cc.write(top_locations, 'wework_la_top_locations', overwrite=True)
# Embed the pre-built CARTO Builder map showing the final candidate locations.
HTML('''
<iframe width="100%" height="600" frameborder="0" src="https://do-v2-demo.carto.com/builder/13c6479a-6cf3-4ab9-af7b-25e34106d1eb/embed" allowfullscreen webkitallowfullscreen mozallowfullscreen oallowfullscreen msallowfullscreen></iframe>
''')